import pandas as pd
import numpy as np
import json
import os
import nltk
from nltk.classify import textcat
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
import wordcloud
from nltk.corpus import stopwords
from nltk import FreqDist
from wordcloud import wordcloud,STOPWORDS
from nltk.sentiment import SentimentIntensityAnalyzer
# Functions
def null_percentage(df: pd.DataFrame) -> pd.Series:
    """Return the percentage of null values in each column of ``df``.

    Args:
        df: The frame to inspect.

    Returns:
        A Series indexed by column name whose values are the share of
        null entries in that column, in percent, rounded to 2 decimals.
    """
    # Null count per column divided by the number of rows gives the fraction.
    null_fraction = df.isnull().sum() / len(df)
    return (null_fraction * 100).round(2)
def jason_to_dataframe(file_name):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Args:
        file_name: Path of a UTF-8 encoded file with one JSON document
            per line.

    Returns:
        A ``pd.DataFrame`` built from the list of parsed records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_name, "r", encoding="utf-8") as data_file:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
        data = [json.loads(line) for line in data_file if line.strip()]
    return pd.DataFrame(data)
# Work from the dataset folder so the data files can be opened by bare name.
dataset_dir = "D:\\Data\\yelp_dataset"
os.chdir(dataset_dir)
# List the folder contents to see which files are available.
os.listdir()
['Dataset_User_Agreement.pdf', 'yelp_academic_dataset_business.json', 'yelp_academic_dataset_checkin.json', 'yelp_academic_dataset_review.json', 'yelp_academic_dataset_tip.json', 'yelp_academic_dataset_user.json']
# Read the business and review files into DataFrames.
business_path = "yelp_academic_dataset_business.json"
review_path = "yelp_academic_dataset_review.json"
business_df = jason_to_dataframe(business_path)
review_df = jason_to_dataframe(review_path)
# Print column names, dtypes and memory usage of the review table.
review_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6990280 entries, 0 to 6990279 Data columns (total 9 columns): # Column Dtype --- ------ ----- 0 review_id object 1 user_id object 2 business_id object 3 stars float64 4 useful int64 5 funny int64 6 cool int64 7 text object 8 date object dtypes: float64(1), int64(3), object(5) memory usage: 480.0+ MB
# Print column names, non-null counts, dtypes and memory usage of the business table.
business_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150346 entries, 0 to 150345 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 business_id 150346 non-null object 1 name 150346 non-null object 2 address 150346 non-null object 3 city 150346 non-null object 4 state 150346 non-null object 5 postal_code 150346 non-null object 6 latitude 150346 non-null float64 7 longitude 150346 non-null float64 8 stars 150346 non-null float64 9 review_count 150346 non-null int64 10 is_open 150346 non-null int64 11 attributes 136602 non-null object 12 categories 150243 non-null object 13 hours 127123 non-null object dtypes: float64(3), int64(2), object(9) memory usage: 16.1+ MB
# Spot-check the language of a handful of reviews with NLTK's TextCat classifier.
cls = textcat.TextCat()
sample_reviews = review_df['text'][1:10]
sample_reviews.apply(cls.guess_language)
1 eng 2 eng 3 eng 4 eng 5 eng 6 eng 7 eng 8 eng 9 eng Name: text, dtype: object
# Each business lists its categories as one comma-separated string:
# split them into columns, then stack into a single flat Series of names.
split_categories = business_df['categories'].str.lower().str.split(",", expand=True)
category_names = split_categories.stack().reset_index(drop=True).str.strip()

# Count how often each category occurs and keep the ten most frequent.
categories = (
    category_names.to_frame(name='Cat_Name')
    .value_counts()
    .reset_index(name='count')
    .head(10)
)

plt.figure(figsize=(15,8))
ax = sns.barplot(x='Cat_Name', y='count', data=categories, palette='rocket')
ax.set_xlabel("Name of the Business Category")
ax.set_ylabel("Count")
ax.set_title("Top 10 Categories Of Business")
plt.show()
plt.figure(figsize=(15,8))
# Count businesses per (lower-cased) city and keep the ten largest.
# rename_axis + reset_index(name=...) sets both column names explicitly, so the
# result does not depend on how the installed pandas version names the output
# of value_counts().reset_index() ('index' before 2.0, the Series name since).
city = (
    business_df['city']
    .str.lower()
    .value_counts()
    .head(10)
    .rename_axis('City')
    .reset_index(name="count")
)
sns.barplot(x='City',y='count',data=city,palette='rocket')
plt.ylabel("Count")
plt.xlabel("City Name")
plt.title("Top Ten Cities with the most business parties in Yelp")
plt.show()
# Keep only businesses whose city is Philadelphia (case-insensitive match).
is_philadelphia = business_df['city'].str.lower() == 'philadelphia'
philadelphia_business = business_df[is_philadelphia]

# Two-stop colour ramp: orange at the low end of review_count, red at the high end.
color_scale = [(0, 'orange'), (1,'red')]

# One marker per business: colour encodes review_count, marker size encodes stars.
fig = px.scatter_mapbox(
    philadelphia_business,
    lat="latitude",
    lon="longitude",
    hover_name="name",
    hover_data=["name", "review_count","stars"],
    color="review_count",
    color_continuous_scale=color_scale,
    size="stars",
    zoom=8,
    height=800,
    width=800,
)
# Use the OpenStreetMap tile style and drop all figure margins.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r":0,"t":0,"l":0,"b":0},
)
fig.show()
# Count 5-star reviews per business and keep the ten businesses with the most.
five_star_counts = review_df.loc[review_df['stars'] == 5, 'business_id'].value_counts().head(10)
# Name both columns explicitly instead of renaming whatever reset_index()
# produces: the default names differ between pandas versions, and a wrong
# rename removes the 'business_id' key that the merge below needs.
top_10_business_review = five_star_counts.rename_axis('business_id').reset_index(name='Count')
# Attach the business name to each id.
top_10_business_review = pd.merge(top_10_business_review,business_df,on='business_id')[['Count','name']]
plt.figure(figsize=(25,8))
sns.barplot(x='name',y='Count',data=top_10_business_review,palette='rocket')
plt.ylabel("Count")
plt.xlabel("Business Name")
plt.title("Name of the business and Count")
plt.show()
# Show the descriptive columns of the business named 'Reading Terminal Market'.
rtm_columns = ['name','city','state','categories','postal_code']
business_df.loc[business_df['name'] == 'Reading Terminal Market', rtm_columns]
| name | city | state | categories | postal_code | |
|---|---|---|---|---|---|
| 143157 | Reading Terminal Market | Philadelphia | PA | Candy Stores, Shopping, Department Stores, Fas... | 19107 |
# NOTE(review): presumably the business_id of Reading Terminal Market -- verify against business_df.
READING_TERMINAL_MARKET_ID = 'ytynqOUb3hjKeJfRj5Tshw'
# Take an explicit copy: columns are added to this frame later, and assigning
# into a filtered slice of review_df raises SettingWithCopyWarning.
Reading_Terminal_Market_review = review_df[review_df['business_id'] == READING_TERMINAL_MARKET_ID].copy()


def _plot_vote_distribution(reviews, vote_column, label):
    """Plot and return the ten most common values of a vote column.

    Args:
        reviews: DataFrame of reviews containing ``vote_column``.
        vote_column: Name of the vote column ('useful', 'funny' or 'cool').
        label: Text used in the x-axis label and the title.

    Returns:
        DataFrame with columns ``vote_column`` (the vote value) and
        'Count' (how many reviews have that value), at most ten rows.
    """
    # Name both columns explicitly; the default names produced by
    # value_counts().reset_index() differ between pandas versions.
    counts = (
        reviews[vote_column]
        .value_counts()
        .head(10)
        .rename_axis(vote_column)
        .reset_index(name='Count')
    )
    plt.figure(figsize=(15,5))
    sns.barplot(x=vote_column, y='Count', data=counts, palette='rocket')
    plt.ylabel("Count")
    plt.xlabel(f"{label} Review")
    plt.title(f"{label} Review and Count")
    plt.show()
    return counts


Reading_Terminal_Market_review_useful = _plot_vote_distribution(Reading_Terminal_Market_review, 'useful', 'Useful')
Reading_Terminal_Market_review_funny = _plot_vote_distribution(Reading_Terminal_Market_review, 'funny', 'funny')
Reading_Terminal_Market_review_cool = _plot_vote_distribution(Reading_Terminal_Market_review, 'cool', 'cool')
# Join the raw review texts into one string. str() on the underlying NumPy
# array would give its printed repr instead, which is abbreviated with '...'
# once the array exceeds the print threshold and also contains quote
# characters and escape sequences.
text = " ".join(Reading_Terminal_Market_review['text'].dropna().astype(str))
tokenizer = nltk.RegexpTokenizer(r"\w+")
rp_word = tokenizer.tokenize(text)
# Build the stopword set once: calling stopwords.words() inside the
# comprehension reloads the list for every token, and list membership is O(n).
english_stopwords = set(stopwords.words('english'))
# The stopword list is lower-case, so compare case-insensitively; otherwise
# capitalised stopwords such as "The" or "I" pass the filter.
filtered_words = [word for word in rp_word if word.lower() not in english_stopwords]
fdist_words = FreqDist(filtered_words)
plt.figure(figsize=(20,5))
# most_common() yields (word, count) pairs; zip(*...) splits them into x and y.
plt.scatter(*zip(*fdist_words.most_common(20)))
plt.show()
# Render a word cloud of the review text on a white background,
# excluding the wordcloud package's built-in stopword set.
stop_w = STOPWORDS
cloud = wordcloud.WordCloud(background_color='white', stopwords=stop_w)
cloud.generate(text).to_image()
# Create the sentiment analyzer and try it on a short sample phrase.
sia = SentimentIntensityAnalyzer()
sample_phrase = "I Hate You!!!"
sia.polarity_scores(sample_phrase)
{'neg': 0.821, 'neu': 0.179, 'pos': 0.0, 'compound': -0.6784}
# Add the 'neg' polarity score of every review as a 'Negative' column.
# assign() returns a new frame, so nothing is written into a filtered slice
# and pandas does not raise SettingWithCopyWarning.
Reading_Terminal_Market_review = Reading_Terminal_Market_review.assign(
    Negative=Reading_Terminal_Market_review['text'].apply(lambda x: sia.polarity_scores(x).get('neg'))
)
C:\Users\samir\AppData\Local\Temp\ipykernel_8656\818546771.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Add the 'pos' polarity score of every review as a 'Positive' column.
# assign() returns a new frame, so nothing is written into a filtered slice
# and pandas does not raise SettingWithCopyWarning.
Reading_Terminal_Market_review = Reading_Terminal_Market_review.assign(
    Positive=Reading_Terminal_Market_review['text'].apply(lambda x: sia.polarity_scores(x).get('pos'))
)
C:\Users\samir\AppData\Local\Temp\ipykernel_8656\4244575143.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Add the 'compound' polarity score of every review as a 'Compound' column.
# assign() returns a new frame, so nothing is written into a filtered slice
# and pandas does not raise SettingWithCopyWarning.
Reading_Terminal_Market_review = Reading_Terminal_Market_review.assign(
    Compound=Reading_Terminal_Market_review['text'].apply(lambda x: sia.polarity_scores(x).get('compound'))
)
C:\Users\samir\AppData\Local\Temp\ipykernel_8656\699295148.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
def polarity_score(compound):
    """Map a compound sentiment score to a sentiment label.

    Uses the thresholds recommended for VADER compound scores:
    ``>= 0.05`` is positive, ``<= -0.05`` is negative, and anything in
    between is neutral. Every score gets a label; a score of exactly
    0.05 previously matched no branch and returned ``None``.

    Args:
        compound: Compound polarity score (a number).

    Returns:
        "Positive", "Negative" or "Neutral".
    """
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"
# Label every review from its compound score and store it as 'Sentiment'.
# assign() returns a new frame, so nothing is written into a filtered slice
# and pandas does not raise SettingWithCopyWarning.
Reading_Terminal_Market_review = Reading_Terminal_Market_review.assign(
    Sentiment=Reading_Terminal_Market_review['Compound'].apply(polarity_score)
)
C:\Users\samir\AppData\Local\Temp\ipykernel_8656\700839630.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Tally how many reviews carry each sentiment label.
sentiment_labels = Reading_Terminal_Market_review['Sentiment']
sentiment_labels.value_counts()
Positive 5371 Negative 287 Neutral 120 Name: Sentiment, dtype: int64